Graphs

Relationship between Educational Attainment and GDP per Capita

Relationship between Educational Attainment and GDP per Capita by Continent

Relationship between Educational Attainment and Child Mortality Rate by Continent

Relationship between Educational Attainment and Number of Workers in Family

Timeline Graph 1

# Mean attainment for every continent/year pair, shown as one connected
# series per continent.
continent_year_avg <- primary_school %>%
  group_by(continent, year) %>%
  summarise(average = mean(primary_school))

ggplot(
  data = continent_year_avg,
  mapping = aes(x = year, y = average, color = continent)
) +
  geom_point() +
  geom_line() +
  labs(
    title = "Educational Attainment over Time by Continent",
    x = "Year",
    y = "Average Rate of Educational Attainment"
  )

Map of World Educational Attainment

# One row per country that appears in the merged data.
world <- distinct(primary_school, country)

# World polygon data as returned by map_data().
map.world <- map_data("world")

# First layer: every region in white with black borders. Second layer: the
# countries listed in `world`, drawn with geom_map()'s default fill.
# NOTE(review): `country` is matched against the `region` names in map.world;
# spelling differences between the two sources would leave a country unshaded
# -- confirm.
ggplot(data = world) +
  geom_map(
    data = map.world,
    map = map.world,
    mapping = aes(map_id = region),
    fill = "white",
    color = "black"
  ) +
  geom_map(mapping = aes(map_id = country), map = map.world) +
  expand_limits(x = map.world$long, y = map.world$lat) + # scale for fill
  theme_map()

# Download a zoom-2 "toner-background" basemap for the given bounding box and
# draw it on its own.
# NOTE(review): newer ggmap releases reportedly replace get_stamenmap() with
# get_stadiamap() (API key required) -- confirm against the installed version.
worldmap <- get_stamenmap(
  bbox = c(left = -180, bottom = -57, right = 179, top = 82.1),
  maptype = "toner-background",
  zoom = 2
)

ggmap(worldmap)

# Mean child mortality during the 1990s for Niger and Costa Rica, highest
# first.
primary_school %>%
  filter(decade == 1990) %>%
  group_by(decade, country) %>%
  summarize(average = mean(child_mortality)) %>%
  arrange(desc(average)) %>%
  # `%in%` tests set membership row by row. The earlier `country == c(...)`
  # recycled the two names along the column (comparing alternating rows
  # against each name) and raised the warning
  # "longer object length is not a multiple of shorter object length".
  filter(country %in% c("Niger", "Costa Rica"))

Tables

Mean Educational Attainment by Continent

# Flag rows whose GDP per capita is above the cutoff.
# NOTE(review): the origin of 3955 is not stated here -- confirm its source
# and units.
high_gdp_cutoff <- 3955

primary_school <- mutate(
  primary_school,
  high_gdp = gdp_capita > high_gdp_cutoff
)
# scatterplots to find interactions between variables
ggplot(
  primary_school,
  aes(x = gdp_capita, y = unemployment, color = high_gdp)
) +
  geom_point()

# linear regression: coefficient tables for three candidate models of
# primary-school attainment
library(broom)
tidy(lm(primary_school ~ high_gdp + unemployment, data = primary_school))

tidy(lm(primary_school ~ gdp_capita + unemployment + child_mortality,
        data = primary_school))

tidy(lm(primary_school ~ gdp_capita + unemployment + rural_pop,
        data = primary_school))

# an interaction effect will let one variable affect the other. Doesn't mean
# that the two variables are related
# coeff 0 -> no relationship
# fewer variables to fit b/c NA

# Unique country names, selected by column name rather than by position so
# the result does not depend on column order.
country_list <- primary_school %>%
  distinct(country)

save(country_list, file = "country_list.Rda")
  
## Map of World Educational Attainment
# Keep, for every country, the most recent year that has a GDP value.
# slice_max() replaces the superseded top_n(); with_ties = FALSE guarantees a
# single row per country, and ungroup() stops the grouping from leaking into
# later steps.
mapdata <- primary_school %>%
  drop_na(gdp_capita) %>%
  group_by(country) %>%
  slice_max(year, n = 1, with_ties = FALSE) %>%
  ungroup()

# World polygon data as returned by map_data().
map.world <- map_data("world")

# Base layer: all regions in white. Top layer: countries in `mapdata`, filled
# by the high/low GDP flag.
# NOTE(review): `country` is matched against map.world `region` names;
# differently spelled countries would stay white -- confirm.
mapdata %>%
  ggplot() +
  geom_map(data = map.world, map = map.world,
           aes(map_id = region), fill = "white", color = "black") +
  geom_map(map = map.world,
           aes(map_id = country, fill = high_gdp), color = "black") +
  expand_limits(x = map.world$long, y = map.world$lat) + # scale for fill
  theme_map()

# Download a zoom-2 "toner-background" basemap for the given bounding box and
# draw it on its own.
# NOTE(review): newer ggmap releases reportedly replace get_stamenmap() with
# get_stadiamap() (API key required) -- confirm against the installed version.
worldmap <- get_stamenmap(
    bbox = c(left = -180, bottom = -57, right = 179, top = 82.1),
    maptype = "toner-background",
    zoom = 2
)
ggmap(worldmap)

# choropleth style by gdp or use high vs low gdp variables
# Pull most recent year for countries --> get rid of years with missing
# values, then take the latest year per country
# 5-6 plots are reasonable
# Quality over quantity
# Attainment over time with one panel per value of `high_gdp`: jittered
# observations plus a smoothed trend (no confidence band).
attainment_by_gdp <- ggplot(
  data = primary_school,
  mapping = aes(x = year, y = primary_school, color = high_gdp)
)

attainment_by_gdp +
  geom_jitter() +
  facet_wrap(vars(high_gdp)) +
  geom_smooth(se = FALSE) +
  labs(
    title = "Comparison of Educational Attainment between countries with high and low GDP per capita",
    x = "Year",
    y = "Educational Attainment"
  )

# Attainment over time with one panel per continent; colour separates the
# `high_gdp` groups inside each panel.
attainment_by_continent <- ggplot(
  data = primary_school,
  mapping = aes(x = year, y = primary_school, color = high_gdp)
)

attainment_by_continent +
  geom_jitter() +
  facet_wrap(vars(continent)) +
  geom_smooth(se = FALSE) +
  labs(
    title = "Comparison of Educational Attainment between countries with high and low GDP per capita",
    x = "Year",
    y = "Educational Attainment"
  )

Table

# Continents ranked by mean GDP per capita, highest first.
# na.rm = TRUE: gdp_capita contains missing values (other steps in this file
# call drop_na(gdp_capita)); without it a single NA turns a continent's mean
# into NA.
primary_school %>%
  group_by(continent) %>%
  summarize(mean_gdp = mean(gdp_capita, na.rm = TRUE)) %>%
  arrange(desc(mean_gdp))
# Attainment against GDP per capita with a separate linear fit for each
# `high_gdp` group; rendered interactively with plotly.
# The title now names the grouping that is actually mapped to colour
# (high/low GDP) instead of "by Continent".
plot <- ggplot(primary_school,
               aes(x = gdp_capita, y = primary_school, color = high_gdp)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "GDP per capita", y = "Educational Attainment (Primary School)",
       title = "Educational Attainment and GDP per Capita by High/Low GDP Group")

ggplotly(plot)

NA

## Schooling Cost Graph

# Attainment against schooling cost, coloured by `high_gdp`, with a linear
# fit per group.
primary_school %>%
  ggplot(aes(x = schooling_cost, y = primary_school, color = high_gdp)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

Gini Ratio

# Attainment against the Gini value, coloured by `high_gdp`, with a linear
# fit per group.
primary_school %>%
  ggplot(aes(x = gini, y = primary_school, color = high_gdp)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

---
title: "Data Science Final Project"
author: "Blair Cha, Kaarin Khandelwal, Dylan Larsen"
output:
  html_document:
    df_print: paged
  html_notebook: default
  pdf_document: default
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r, warning=FALSE, echo=FALSE, message = FALSE}
library(readxl)
library(dplyr)
library(tidyverse)
library(lubridate)
library(countrycode)
library(stringr)
library(wesanderson)
library(plotly)
library(ggmap)
library(ggthemes)
library(broom)

# Raw imports, one workbook per indicator. The `0` suffix marks the table as
# read from disk, before any reshaping.
primary_school0  <- read_excel(path = "primary_school.xlsx")
urban_poverty0   <- read_excel(path = "urban_poverty.xlsx")
unemployment0    <- read_excel(path = "unemployment.xlsx")
gdp_capita0      <- read_excel(path = "gdp_capita.xlsx")
child_mortality0 <- read_excel(path = "child_mortality.xlsx")
family_workers0  <- read_excel(path = "family_workers.xlsx")
rural_pop0       <- read_excel(path = "rural_pop.xlsx")
schooling_cost0  <- read_excel(path = "schooling_cost.xlsx")
freedom0         <- read_excel(path = "freedom.xlsx")
gini0            <- read_excel(path = "gini_coefficient.xlsx")
gender0          <- read_excel(path = "gender_ratio.xlsx")
```

```{r, warning=FALSE, echo=FALSE}
# Reassembling each dataset to longer style

# Reshape one wide indicator table into long form.
#
# wide_data:  data frame with a `country` column; every other column is
#             pivoted and its name becomes the `year` value.
# value_name: name (string) of the column that receives the cell values.
# Returns a long table with columns `country`, `year` (converted with
# as.numeric) and `value_name`.
to_long_by_year <- function(wide_data, value_name) {
  wide_data %>%
    pivot_longer(cols = -country,
                 names_to = "year",
                 values_to = value_name) %>%
    mutate(year = as.numeric(year))
}

# The same reshape applied to every indicator; only the value column differs.
primary_school  <- to_long_by_year(primary_school0, "primary_school")
urban_poverty   <- to_long_by_year(urban_poverty0, "urban_poverty")
unemployment    <- to_long_by_year(unemployment0, "unemployment")
gdp_capita      <- to_long_by_year(gdp_capita0, "gdp_capita")
child_mortality <- to_long_by_year(child_mortality0, "child_mortality")
family_workers  <- to_long_by_year(family_workers0, "family_workers")
rural_pop       <- to_long_by_year(rural_pop0, "rural_pop")
schooling_cost  <- to_long_by_year(schooling_cost0, "schooling_cost")
freedom         <- to_long_by_year(freedom0, "freedom")
gini            <- to_long_by_year(gini0, "gini")
gender          <- to_long_by_year(gender0, "gender_ratio")
```

```{r, warning=FALSE, echo=FALSE}
# Joining every dataset to primary_school

# All indicator tables share the (year, country) key. Inner joins keep only
# the key pairs that are present in every table.
join_keys <- c("year", "country")

primary_school1 <- inner_join(primary_school, urban_poverty, by = join_keys) %>%
  inner_join(unemployment, by = join_keys) %>%
  inner_join(gdp_capita, by = join_keys) %>%
  inner_join(child_mortality, by = join_keys) %>%
  inner_join(family_workers, by = join_keys) %>%
  inner_join(rural_pop, by = join_keys) %>%
  inner_join(schooling_cost, by = join_keys) %>%
  inner_join(freedom, by = join_keys) %>%
  inner_join(gini, by = join_keys) %>%
  inner_join(gender, by = join_keys)
```

```{r, warning=FALSE, echo=FALSE}
# Adding continent variable

# Translate each country name into its continent with countrycode().
newdata <- data.frame(country = primary_school$country)
newdata$continent <- countrycode(sourcevar = primary_school$country,
                                 origin = "country.name",
                                 destination = "continent")

# Collapse to one (continent, country) row per country so the join below does
# not multiply rows.
newdata1 <- newdata %>%
  select(continent, country) %>%
  distinct()

# Attach the continent, derive the decade (e.g. 1994 -> 1990) and drop rows
# that have no attainment value.
primary_school <- primary_school1 %>%
  inner_join(newdata1, by = "country") %>%
  mutate(decade = floor(year / 10) * 10) %>%
  drop_na(primary_school)

# View() opens an interactive data viewer, so only call it in an interactive
# session; knitting the document runs non-interactively.
if (interactive()) {
  View(primary_school)
}

# Suggestions: 1) make some variables categorical (category can be "missing value") 2) don't use variables with too many NAs 3) select range of years 4) find other new variables 
```

```{r, warning=FALSE, echo=FALSE}
# Write the assembled data set to disk under its own name.
save(list = "primary_school", file = "primary_school.Rda")
# Restore it with load("primary_school.Rda")
```

# Graphs

## Relationship between Educational Attainment and GDP per Capita

## Relationship between Educational Attainment and GDP per Capita by Continent
```{r, warning=FALSE, echo=FALSE}
# Attainment against GDP per capita: one colour and one linear fit per
# continent, rendered interactively with plotly.
plot1 <- primary_school %>%
  ggplot(mapping = aes(x = gdp_capita, y = primary_school, color = continent)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Educational Attainment and GDP per Capita by Continent",
    x = "GDP per capita",
    y = "Educational Attainment (Primary School)"
  )

ggplotly(plot1)
```

## Relationship between Educational Attainment and Child Mortality Rate by Continent
```{r, warning=FALSE, echo=FALSE}
# Attainment against child mortality: one linear fit per continent, coloured
# with five values from the wesanderson "Moonrise3" palette.
continent_palette <- wes_palette(name = "Moonrise3", n = 5)

plot2 <- primary_school %>%
  ggplot(mapping = aes(x = child_mortality, y = primary_school, color = continent)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_manual(values = continent_palette) +
  labs(
    title = "Educational Attainment and Child Mortality Rate by Continent",
    x = "Child Mortality Rate",
    y = "Educational Attainment (Primary School)"
  )

ggplotly(plot2)
```

## Relationship between Educational Attainment and Number of Workers in Family
```{r, warning=FALSE, echo=FALSE}
# Attainment against number of family workers; point colour runs from red
# (low GDP per capita) to blue (high).
# The y aesthetic is `primary_school` so the data match the axis label and
# title; it previously plotted `schooling_cost` under the attainment label.
plot3 <- ggplot(primary_school,
                aes(x = family_workers, y = primary_school, color = gdp_capita)) +
  geom_point(size = 2, alpha = 0.7) +
  scale_color_gradient(low = "red", high = "blue") +
  labs(x = "Number of Workers in Family",
       y = "Educational Attainment (Primary School)",
       title = "Educational Attainment and Number of Workers in Family",
       color = "GDP per Capita")

ggplotly(plot3)
```

## Timeline Graph 1 
```{r}
# Mean attainment for every continent/year pair, shown as one connected
# series per continent.
continent_year_avg <- primary_school %>%
  group_by(continent, year) %>%
  summarise(average = mean(primary_school))

ggplot(
  data = continent_year_avg,
  mapping = aes(x = year, y = average, color = continent)
) +
  geom_point() +
  geom_line() +
  labs(
    title = "Educational Attainment over Time by Continent",
    x = "Year",
    y = "Average Rate of Educational Attainment"
  )
```


## Map of World Educational Attainment
```{r}
# One row per country that appears in the merged data.
world <- distinct(primary_school, country)

# World polygon data as returned by map_data().
map.world <- map_data("world")

# First layer: every region in white with black borders. Second layer: the
# countries listed in `world`, drawn with geom_map()'s default fill.
# NOTE(review): `country` is matched against the `region` names in map.world;
# spelling differences between the two sources would leave a country unshaded
# -- confirm.
ggplot(data = world) +
  geom_map(
    data = map.world,
    map = map.world,
    mapping = aes(map_id = region),
    fill = "white",
    color = "black"
  ) +
  geom_map(mapping = aes(map_id = country), map = map.world) +
  expand_limits(x = map.world$long, y = map.world$lat) + # scale for fill
  theme_map()

# Download a zoom-2 "toner-background" basemap for the given bounding box and
# draw it on its own.
# NOTE(review): newer ggmap releases reportedly replace get_stamenmap() with
# get_stadiamap() (API key required) -- confirm against the installed version.
worldmap <- get_stamenmap(
  bbox = c(left = -180, bottom = -57, right = 179, top = 82.1),
  maptype = "toner-background",
  zoom = 2
)

ggmap(worldmap)
```



```{r}
# Mean child mortality during the 1990s for Niger and Costa Rica, highest
# first.
primary_school %>%
  filter(decade == 1990) %>%
  group_by(decade, country) %>%
  summarize(average = mean(child_mortality)) %>%
  arrange(desc(average)) %>%
  # `%in%` tests set membership row by row; `country == c(...)` would recycle
  # the two names along the column and compare alternating rows against each
  # name, silently dropping matches.
  filter(country %in% c("Niger", "Costa Rica"))
```



# Tables
## Mean Educational Attainment by Continent
```{r, warning=FALSE, echo=FALSE}
# Continents ranked by mean primary-school attainment, highest first.
continent_means <- summarise(
  group_by(primary_school, continent),
  primary_school_mean = mean(primary_school)
)

arrange(continent_means, desc(primary_school_mean))
```

```{r}
# Flag rows whose GDP per capita is above the cutoff.
# NOTE(review): the origin of 3955 is not stated here -- confirm its source
# and units.
high_gdp_cutoff <- 3955

primary_school <- mutate(
  primary_school,
  high_gdp = gdp_capita > high_gdp_cutoff
)
```

```{r}
# scatterplots to find interactions between variables
ggplot(
  primary_school,
  aes(x = gdp_capita, y = unemployment, color = high_gdp)
) +
  geom_point()

# linear regression: coefficient tables for three candidate models of
# primary-school attainment
library(broom)
tidy(lm(primary_school ~ high_gdp + unemployment, data = primary_school))

tidy(lm(primary_school ~ gdp_capita + unemployment + child_mortality,
        data = primary_school))

tidy(lm(primary_school ~ gdp_capita + unemployment + rural_pop,
        data = primary_school))

# an interaction effect will let one variable affect the other. Doesn't mean
# that the two variables are related
# coeff 0 -> no relationship
# fewer variables to fit b/c NA

# Unique country names, selected by column name rather than by position so
# the result does not depend on column order.
country_list <- primary_school %>%
  distinct(country)

save(country_list, file = "country_list.Rda")
  
```


```{r}
## Map of World Educational Attainment
# Keep, for every country, the most recent year that has a GDP value.
# slice_max() replaces the superseded top_n(); with_ties = FALSE guarantees a
# single row per country, and ungroup() stops the grouping from leaking into
# later steps.
mapdata <- primary_school %>%
  drop_na(gdp_capita) %>%
  group_by(country) %>%
  slice_max(year, n = 1, with_ties = FALSE) %>%
  ungroup()

# World polygon data as returned by map_data().
map.world <- map_data("world")

# Base layer: all regions in white. Top layer: countries in `mapdata`, filled
# by the high/low GDP flag.
# NOTE(review): `country` is matched against map.world `region` names;
# differently spelled countries would stay white -- confirm.
mapdata %>%
  ggplot() +
  geom_map(data = map.world, map = map.world,
           aes(map_id = region), fill = "white", color = "black") +
  geom_map(map = map.world,
           aes(map_id = country, fill = high_gdp), color = "black") +
  expand_limits(x = map.world$long, y = map.world$lat) + # scale for fill
  theme_map()

# Download a zoom-2 "toner-background" basemap for the given bounding box and
# draw it on its own.
# NOTE(review): newer ggmap releases reportedly replace get_stamenmap() with
# get_stadiamap() (API key required) -- confirm against the installed version.
worldmap <- get_stamenmap(
    bbox = c(left = -180, bottom = -57, right = 179, top = 82.1),
    maptype = "toner-background",
    zoom = 2
)
ggmap(worldmap)

# choropleth style by gdp or use high vs low gdp variables
# Pull most recent year for countries --> get rid of years with missing
# values, then take the latest year per country
# 5-6 plots are reasonable
# Quality over quantity
```


```{r}
# Attainment over time with one panel per value of `high_gdp`: jittered
# observations plus a smoothed trend (no confidence band).
attainment_by_gdp <- ggplot(
  data = primary_school,
  mapping = aes(x = year, y = primary_school, color = high_gdp)
)

attainment_by_gdp +
  geom_jitter() +
  facet_wrap(vars(high_gdp)) +
  geom_smooth(se = FALSE) +
  labs(
    title = "Comparison of Educational Attainment between countries with high and low GDP per capita",
    x = "Year",
    y = "Educational Attainment"
  )
```
```{r}
# Attainment over time with one panel per continent; colour separates the
# `high_gdp` groups inside each panel.
attainment_by_continent <- ggplot(
  data = primary_school,
  mapping = aes(x = year, y = primary_school, color = high_gdp)
)

attainment_by_continent +
  geom_jitter() +
  facet_wrap(vars(continent)) +
  geom_smooth(se = FALSE) +
  labs(
    title = "Comparison of Educational Attainment between countries with high and low GDP per capita",
    x = "Year",
    y = "Educational Attainment"
  )
```
# Table
```{r}
# Continents ranked by mean GDP per capita, highest first.
# na.rm = TRUE: gdp_capita contains missing values (other steps in this file
# call drop_na(gdp_capita)); without it a single NA turns a continent's mean
# into NA.
primary_school %>%
  group_by(continent) %>%
  summarize(mean_gdp = mean(gdp_capita, na.rm = TRUE)) %>%
  arrange(desc(mean_gdp))
```

```{r}
# Attainment against GDP per capita with a separate linear fit for each
# `high_gdp` group; rendered interactively with plotly.
# The title now names the grouping that is actually mapped to colour
# (high/low GDP) instead of "by Continent".
plot <- ggplot(primary_school,
               aes(x = gdp_capita, y = primary_school, color = high_gdp)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "GDP per capita", y = "Educational Attainment (Primary School)",
       title = "Educational Attainment and GDP per Capita by High/Low GDP Group")

ggplotly(plot)

```

## Schooling Cost Graph

```{r}
# Attainment against schooling cost, coloured by `high_gdp`, with a linear
# fit per group.
primary_school %>%
  ggplot(aes(x = schooling_cost, y = primary_school, color = high_gdp)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
```


## Gini Ratio
```{r}
# Attainment against the Gini value, coloured by `high_gdp`, with a linear
# fit per group.
primary_school %>%
  ggplot(aes(x = gini, y = primary_school, color = high_gdp)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
```


